Stage 1

Evaluate the cell type labels (the “truth” for scANVI / scVI).

# edit and run this locally (laptop / etc)
~/git/scEiaD_quant/workflow/scripts/hand_change_ct_labels.R
# rsync the new cell label file to biowulf
rsync -Prav scEiaD_cell_labels_2024_08_27.csv.gz h2:/home/mcgaugheyd/git/scEiaD_quant/
# create the organism level h5ad file from the individual scEiaD_quant made h5ad files
# now on biowulf
# sinteractive --mem=128G
mamba deactivate; mamba activate rscvi
python ~/git/scEiaD_modeling/workflow/scripts/merge_adata.py hs111.fin.txt /home/mcgaugheyd/git/scEiaD_quant/sample_meta.scEiaD_v1.2024_02_28.01.tsv.gz /home/mcgaugheyd/git/scEiaD_quant/scEiaD_cell_labels_2024_08_27.csv.gz hs111.adata.solo.20240827.h5ad hs111.adata.solo.20240827.obs.csv.gz
# back to local computer to copy the cell metadata
cd ~/git/scEiaD_modeling
rsync -Prav h2:/data/OGVFB_BG/scEiaD/2024_02_28/hs111.adata.solo.20240827.obs.csv.gz .
/var/folders/s4/y5f1tt296dj8088gvczcx11d4lrnr7/T/RtmpZmrNsy/chunk-code-15ae8120e4ea5.txt: line 2: /Users/mcgaugheyd/git/scEiaD_quant/workflow/scripts/hand_change_ct_labels.R: Permission denied
kex_exchange_identification: read: Connection reset by peer
Connection reset by 128.231.2.3 port 22
rsync: connection unexpectedly closed (0 bytes received so far) [sender]
rsync error: error in rsync protocol data stream (code 12) at /AppleInternal/Library/BuildRoots/289ffcb4-455d-11ef-953d-e2437461156c/Library/Caches/com.apple.xbs/Sources/rsync/rsync/io.c(453) [sender=2.6.9]
/var/folders/s4/y5f1tt296dj8088gvczcx11d4lrnr7/T/RtmpZmrNsy/chunk-code-15ae8120e4ea5.txt: line 8: mamba: command not found
/var/folders/s4/y5f1tt296dj8088gvczcx11d4lrnr7/T/RtmpZmrNsy/chunk-code-15ae8120e4ea5.txt: line 8: mamba: command not found
/var/folders/s4/y5f1tt296dj8088gvczcx11d4lrnr7/T/RtmpZmrNsy/chunk-code-15ae8120e4ea5.txt: line 9: python: command not found
kex_exchange_identification: read: Connection reset by peer
Connection reset by 128.231.2.3 port 22
rsync: connection unexpectedly closed (0 bytes received so far) [receiver]
rsync error: unexplained error (code 255) at /AppleInternal/Library/BuildRoots/289ffcb4-455d-11ef-953d-e2437461156c/Library/Caches/com.apple.xbs/Sources/rsync/rsync/io.c(453) [receiver=2.6.9]

Stage 2

  • pull data from one species
  • filter to age group (dev or mature)
  • select random (up to 2k) cell type per study and output those barcodes
  • output barcodes for the non-selected cells
  • run scvi on biowulf
library(tidyverse)
sample_meta <- data.table::fread('~/git/scEiaD_quant/sample_meta.scEiaD_v1.2024_02_28.01.tsv.gz')
cell_meta <- data.table::fread('~/data/scEiaD_modeling/hs111.adata.solo.20240827.obs.csv.gz')[,-1] %>% 
  relocate(barcode) %>% 
  filter(solo_doublet == "FALSE")

hs111_eye <- cell_meta %>% 
  filter(study_accession != 'SRP362101') %>% 
  mutate(stage = case_when(as.numeric(age) <= 10 ~ 'Developing', 
                           TRUE ~ 'Mature'), 
         side = case_when(tissue == 'Brain Choroid Plexus' ~ 'Brain Choroid Plexus',
                          grepl("Choroid|RPE", tissue) ~ 'eye',
                          grepl("Retina", tissue) ~ 'eye',
                          grepl("Outf", tissue) ~ 'FrontEye',
                          grepl("Iris", tissue) ~ 'FrontEye',
                          grepl("Sclera", tissue) ~ 'FrontEye',
                          grepl("Cornea", tissue) ~ 'FrontEye',
                          grepl("Macula", tissue) ~ 'eye',
                          grepl("Trabecul", tissue) ~ 'FrontEye',
                          grepl("Optic", tissue) ~ 'eye',
                          TRUE ~ tissue)) %>% 
  filter(organ == 'Eye', # 2024 09 03 oops
         organism == 'Homo sapiens',
         !grepl("^#", sample_accession),
         source == 'Tissue',
         #tissue %in% c("Macula", "Retina"),
         #side %in% c("FrontEye", "eye"),
         #side %in% c("eye"),# 2024 08 31
         #capture_type == 'cell', # 2024 08 28
         #kb_tech %in% c("10xv1","10xv2","10xv3"), # 2024 08 28
         stage == 'Mature')# %>% 
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `stage = case_when(as.numeric(age) <= 10 ~ "Developing", TRUE ~ "Mature")`.
Caused by warning:
! NAs introduced by coercion
#filter(study_accession != 'SRP447468') # 2024 08 29 remove (for now????) this optic nerve and adjacent study from sanes as it clusters (hclust) apart from everything else
# do not use studies with less than 1000 cells as ref
srp362101_sra <- read_csv('~/git/scEiaD_quant/data/SRP362101_SraRunTable.txt')
Rows: 24 Columns: 32── Column specification ──────────────────────────────────────────────────────────────────────
Delimiter: ","
chr  (24): Run, Assay Type, BIOMATERIAL_PROVIDER, BioProject, BioSample, BioSampleModel, C...
dbl   (6): AGE, AvgSpotLen, Bases, Bytes, version, Sample Name
dttm  (2): ReleaseDate, create_date
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
hs111_eye <- bind_rows(hs111_eye,
                       cell_meta %>% filter(sample_accession %in% srp362101_sra$Experiment)) # 2024 09 11
# do not use studies with less than 1000 cells as ref
hs111_study <- hs111_eye %>% 
  group_by(study_accession) %>% summarise(Count = n()) %>% filter(Count>1000)
set.seed(101294)
hs111_eye$MajorCellType %>% table()
.
                             amacrine                ape          astrocyte 
            433712              65766               8050              10726 
                 b               beam            bipolar   choriocapillaris 
              1932               2694              39673                748 
      ciliary body               cone             cornea corneal progenitor 
              1113               6552                169                  2 
       endothelial         epithelial         equatorial              fiber 
             13436              39731               2905               1929 
        fibroblast             goblet         horizontal             immune 
             38626                251               6965               4462 
               jct             limbal         lymphocyte         macrophage 
               462                 20                453               4950 
              mast         melanocyte          microglia           monocyte 
               136              19936                963                456 
           mueller             muscle               npce              oligo 
             26975              15450               4596               3051 
               opc                pce           pericyte                ppe 
              1814              13395               2710              14010 
         red blood   retinal ganglion                rod        rod bipolar 
               190              18380              98029               3959 
               rpc                rpe            schwann      smooth muscle 
                23               1566               8192               2551 
         sphincter               t/nk 
              1053               3217 
hs111_ref_bcs <- hs111_eye %>% 
  filter(study_accession %in% hs111_study$study_accession) %>% 
  filter(MajorCellType != '',   # 2024 04 16 big change - only use labelled cells in ref
         MajorCellType != 'rpc') %>%  # 2024 08 30 23 cells in 83 year old labelled as rpc??? 
  # 2024 07 30 fixed mistake in filtering that still let them in
  group_by(study_accession, MajorCellType) %>%  # 2024 07 30 big change - sample on study *and* majorcelltype
  sample_n(2000, replace = TRUE) %>%  
  unique()

hs111_query_bcs <- hs111_eye %>% 
  filter(!barcode %in% hs111_ref_bcs$barcode) 

#hs111_ref_bcs$barcode %>% write(gzfile('~/git/scEiaD_modeling/data/hs111_mature_eye_ref_bcs.full.20240911-01.csv.gz'))
#hs111_query_bcs$barcode %>% write(gzfile('~/git/scEiaD_modeling/data/hs111_mature_eye_query_bcs.full.20240911-01.csv.gz'))

run scVI

now go to biowulf2:/data/OGVFB_BG/scEiaD/2024_02_28/snakeout/hs111_mature_eye

# will take ~6-10 hours (depending on hpc usage)
mamba deactivate; mamba activate; bash ~/git/scEiaD_modeling/Snakemake.wrapper.sh ~/git/scEiaD_modeling/workflow/Snakefile ~/git/scEiaD_modeling/config/config_hs111_mature_eye_full.yaml ~/git/scEiaD_modeling/config/cluster.json
/var/folders/s4/y5f1tt296dj8088gvczcx11d4lrnr7/T/RtmpZmrNsy/chunk-code-15ae8544c965f.txt: line 2: mamba: command not found
/var/folders/s4/y5f1tt296dj8088gvczcx11d4lrnr7/T/RtmpZmrNsy/chunk-code-15ae8544c965f.txt: line 2: mamba: command not found
/Users/mcgaugheyd/git/scEiaD_modeling/Snakemake.wrapper.sh: line 16: snakemake: command not found

rsync output from biowulf2 to local computer

cd /Users/mcgaugheyd/data/scEiaD_modeling/hs111_mature_eye
rsync -Prav h2:/data/OGVFB_BG/scEiaD/2024_02_28/snakeout/hs111_mature_eye_full/*scib* .
rsync -Prav h2:/data/OGVFB_BG/scEiaD/2024_02_28/snakeout/hs111_mature_eye_full/*obs* .
rsync -Prav h2:/data/OGVFB_BG/scEiaD/2024_02_28/snakeout/hs111_mature_eye_full/*.leiden3.csv.gz .
rsync -Prav h2:/data/OGVFB_BG/scEiaD/2024_02_28/snakeout/hs111_mature_eye_full/*hvg.csv.gz .
kex_exchange_identification: read: Connection reset by peer
Connection reset by 128.231.2.3 port 22
rsync: connection unexpectedly closed (0 bytes received so far) [receiver]
rsync error: unexplained error (code 255) at /AppleInternal/Library/BuildRoots/289ffcb4-455d-11ef-953d-e2437461156c/Library/Caches/com.apple.xbs/Sources/rsync/rsync/io.c(453) [receiver=2.6.9]
kex_exchange_identification: read: Connection reset by peer
Connection reset by 128.231.2.3 port 22
rsync: connection unexpectedly closed (0 bytes received so far) [receiver]
rsync error: unexplained error (code 255) at /AppleInternal/Library/BuildRoots/289ffcb4-455d-11ef-953d-e2437461156c/Library/Caches/com.apple.xbs/Sources/rsync/rsync/io.c(453) [receiver=2.6.9]
kex_exchange_identification: read: Connection reset by peer
Connection reset by 128.231.2.3 port 22
rsync: connection unexpectedly closed (0 bytes received so far) [receiver]
rsync error: unexplained error (code 255) at /AppleInternal/Library/BuildRoots/289ffcb4-455d-11ef-953d-e2437461156c/Library/Caches/com.apple.xbs/Sources/rsync/rsync/io.c(453) [receiver=2.6.9]
kex_exchange_identification: read: Connection reset by peer
Connection reset by 128.231.2.3 port 22
rsync: connection unexpectedly closed (0 bytes received so far) [receiver]
rsync error: unexplained error (code 255) at /AppleInternal/Library/BuildRoots/289ffcb4-455d-11ef-953d-e2437461156c/Library/Caches/com.apple.xbs/Sources/rsync/rsync/io.c(453) [receiver=2.6.9]

Stage 3

scib scoring

scib_files <- list.files(path = '~/data/scEiaD_modeling/hs111_mature_eye_full/', full.names = TRUE) %>% grep('scib',.,value = TRUE)
scib_scores <- purrr::map(scib_files, read.csv)
names(scib_scores) <- scib_files %>% str_extract("\\d\\d\\d\\dhvg.*l") %>% gsub("^_","",.)
for (i in names(scib_scores)){
  scib_scores[[i]]$Delta <- as.numeric(scib_scores[[i]][2,'Total']) - as.numeric(scib_scores[[i]][1,'Total'])
}
bind_rows(scib_scores, .id = 'Params') %>% 
  filter(Embedding == 'X_scVI') %>% 
  mutate(Params= gsub("hvg|e|l","",Params)) %>% 
  separate(Params, into = c("HVG","Epochs", "Latent Dimensions"), sep = "_") %>% 
  mutate_all(as.numeric) %>% 
  mutate(Embedding = 'X_scVI') %>% 
  arrange(-Total)
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `Embedding = .Primitive("as.double")(Embedding)`.
Caused by warning:
! NAs introduced by coercion

Assess Output

library(tidyverse)
source('analysis_scripts.R')

obs <- pull_obs('~/data/scEiaD_modeling/hs111_mature_eye_full//hs111_mature_eye_20240911_withSUSPECTstudies2000hvg_200e_30l.obs.csv.gz')
`summarise()` has grouped output by 'leiden3'. You can override using the `.groups` argument.`summarise()` has grouped output by 'leiden3'. You can override using the `.groups` argument.`summarise()` has grouped output by 'leiden3'. You can override using the `.groups` argument.`summarise()` has grouped output by 'leiden3'. You can override using the `.groups` argument.`summarise()` has grouped output by 'leiden3'. You can override using the `.groups` argument.`summarise()` has grouped output by 'leiden3'. You can override using the `.groups` argument.`summarise()` has grouped output by 'leiden3'. You can override using the `.groups` argument.
swarooks_markers <- read_csv('../data/MBrooks313_scRNAseq_Retina_Cell_type_MB_v2_positive.csv')
Rows: 257 Columns: 3── Column specification ──────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (3): cell_type, sub_cell_type, gene
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
diff <- pull_diff("~/data/scEiaD_modeling/hs111_mature_eye_full/hs111_mature_eye_20240911_withSUSPECTstudies2000hvg_200e_30l.difftesting.leiden3.csv.gz")

'select()' returned 1:many mapping between keys and columns
Warning: Detected an unexpected many-to-many relationship between `x` and `y`.Warning: Detected an unexpected many-to-many relationship between `x` and `y`.
obs$labels <- obs$labels %>% 
  left_join(diff$top_diff)
Joining with `by = join_by(leiden3)`

Cluster Overview

Ratio (percentage) of labelled cell types for each leiden3 cluster

obs$labels %>% 
  arrange(aMCT) %>% 
  mutate(leiden3 = as.factor(leiden3)) %>% 
  DT::datatable(filter = 'top')
Registered S3 method overwritten by 'htmlwidgets':
  method           from         
  print.htmlwidget tools:rstudio

background fraction against solo (doublet) score

Labelled points are clusters with more than one machine CT (above 0.1)

obs$labels %>% 
  mutate(mMCT_unique_count = str_count(mMCT, ',') + 1) %>%  
  ggplot(aes(x=solo_score,y=background_fraction)) + 
  geom_point(aes(color = as.factor(mMCT_unique_count), size = log1p(TotalCount))) + 
  ggrepel::geom_label_repel(data = . %>% filter(mMCT_unique_count > 1), aes(label = leiden3), max.overlaps = Inf)

Discrepancies

Between author cell type and machine (scANVI) cell type

obs$labels %>% filter(aMCT != mMCT) %>% data.frame

Multiple mMCT above 10% of the total in a cluster

obs$labels %>% filter(grepl(",",mMCT)) %>% select(-solo_score, -background_fraction, -cell_probability) %>% data.frame %>% arrange(-TotalCount)

UMAP Plots

obs$obs %>% 
  left_join(obs$labels, by = 'leiden3') %>% 
  ggplot(aes(x=umap1,y=umap2)) +
  scattermore::geom_scattermore(aes(color = MCT_scANVI), pointsize = 0.8, alpha = 0.5) +
  ggrepel::geom_label_repel(data = . %>% group_by(MCT_scANVI) %>% 
                              summarise(umap1 = median(umap1),
                                        umap2 = median(umap2)),
                            aes(label = MCT_scANVI, color = MCT_scANVI)) +
  scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) + 
  cowplot::theme_cowplot() + theme(legend.position = "none")


obs$obs %>% 
  left_join(obs$labels, by = 'leiden3') %>% 
  ggplot(aes(x=umap1,y=umap2)) +
  scattermore::geom_scattermore(aes(color = mCT), pointsize = 0.8, alpha = 0.5) +
  ggrepel::geom_label_repel(data = . %>% group_by(mCT) %>% 
                              summarise(umap1 = median(umap1),
                                        umap2 = median(umap2)),
                            aes(label = mCT, color = mCT)) +
  scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) + 
  cowplot::theme_cowplot() + theme(legend.position = "none")



obs$obs %>% 
  left_join(obs$labels, by = 'leiden3') %>% 
  ggplot(aes(x=umap1,y=umap2)) +
  scattermore::geom_scattermore(aes(color = mMCT), pointsize = 4, alpha = 0.5) +
  scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) + 
  cowplot::theme_cowplot() + theme(legend.position = "none") + 
  facet_wrap(~study_accession)




obs$obs %>% 
  left_join(obs$labels, by = 'leiden3') %>% 
  ggplot(aes(x=umap1,y=umap2)) +
  scattermore::geom_scattermore(aes(color = capture_type)) +
  facet_wrap(~MCT_scANVI) +
  scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) + 
  cowplot::theme_cowplot()


obs$obs %>% 
  left_join(obs$labels, by = 'leiden3') %>% filter(grepl("bipolar", mMCT)) %>% 
  ggplot(aes(x=umap1,y=umap2)) +
  scattermore::geom_scattermore(aes(color = study_accession), pointsize = 0.8, alpha = 0.8) +
  ggrepel::geom_label_repel(data = . %>% group_by(leiden3) %>% 
                              summarise(umap1 = median(umap1),
                                        umap2 = median(umap2)),
                            aes(label = leiden3)) +
  scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) + 
  ggtitle("bipolar") +
  cowplot::theme_cowplot() 


obs$obs %>% 
  left_join(obs$labels, by = 'leiden3') %>% filter(mMCT == 'rpe') %>% 
  ggplot(aes(x=umap1,y=umap2)) +
  scattermore::geom_scattermore(aes(color = study_accession), pointsize = 0.8, alpha = 0.8) +
  ggrepel::geom_label_repel(data = . %>% group_by(mCT) %>% 
                              summarise(umap1 = median(umap1),
                                        umap2 = median(umap2)),
                            aes(label = mCT)) +
  scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) + 
  cowplot::theme_cowplot() 


obs$obs %>% 
  left_join(obs$labels, by = 'leiden3') %>% filter(mMCT == 'amacrine') %>% 
  ggplot(aes(x=umap1,y=umap2)) +
  scattermore::geom_scattermore(aes(color = study_accession), pointsize = 0.8, alpha = 0.8) +
  ggrepel::geom_label_repel(data = . %>% group_by(leiden3) %>% 
                              summarise(umap1 = median(umap1),
                                        umap2 = median(umap2)),
                            aes(label = leiden3)) +
  scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) + 
  cowplot::theme_cowplot()+
  ggtitle("amacrine")


obs$obs %>% 
  left_join(obs$labels, by = 'leiden3') %>% filter(mMCT == 'amacrine') %>% 
  ggplot(aes(x=umap1,y=umap2)) +
  scattermore::geom_scattermore(aes(color = as.factor(leiden3)), pointsize = 0.8, alpha = 0.8) +
  ggrepel::geom_label_repel(data = . %>% group_by(mCT) %>% 
                              summarise(umap1 = median(umap1),
                                        umap2 = median(umap2)),
                            aes(label = mCT)) +
  scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) + 
  cowplot::theme_cowplot() 


obs$obs %>% 
  left_join(obs$labels, by = 'leiden3') %>% filter(mCT == 'mueller') %>% 
  ggplot(aes(x=umap1,y=umap2)) +
  scattermore::geom_scattermore(aes(color = as.factor(leiden3)), pointsize = 0.8, alpha = 0.8) +
  ggrepel::geom_label_repel(data = . %>% group_by(leiden3) %>% 
                              summarise(umap1 = median(umap1),
                                        umap2 = median(umap2)),
                            aes(label = leiden3)) +
  scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) + 
  cowplot::theme_cowplot() + ggtitle("mueller")


obs$obs %>% 
  left_join(obs$labels, by = 'leiden3') %>% filter(mCT == 'rod') %>% 
  ggplot(aes(x=umap1,y=umap2)) +
  scattermore::geom_scattermore(aes(color = as.factor(leiden3)), pointsize = 0.8, alpha = 0.8) +
  ggrepel::geom_label_repel(data = . %>% group_by(leiden3) %>% 
                              summarise(umap1 = median(umap1),
                                        umap2 = median(umap2)),
                            aes(label = leiden3)) +
  scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) + 
  cowplot::theme_cowplot() + ggtitle("rod")

hclust

Take pseudobulk values (at the cluster level) and hierarchically cluster them to ensure there aren’t any issues in either the overall structure (e.g. rod and cones are intersperse)d and/or to identify any potential mislabeled clusters

library(tidyverse)
pb <- data.table::fread('~/data/scEiaD_modeling/hs111_mature_eye_full/hs111_mature_eye_20240911_withSUSPECTstudies2000hvg_200e_30l.pseudoBulk.leiden3.csv.gz')
colnames(pb) <- gsub("\\.\\d+","",colnames(pb))
hvg <- data.table::fread('~/data/scEiaD_modeling/hs111_mature_eye_full/hvg2000.csv.gz')[-1,]
rnames <- pb$V1
clust <- str_extract(rnames, '\\d+') %>% as.integer()
pb <- pb[,-1] %>% as.matrix()
row.names(pb) <- as.character(clust)
pb <- pb[as.character(obs$labels$leiden3),]

# library(SingleCellExperiment)
# library(scater)
# library(scran)
# do_hvg <- function(cts){
#   sce <- SingleCellExperiment(list(counts = cts))
#   sce <- logNormCounts(sce)
#   hvg <- modelGeneVar(sce)
#   # Visualizing the fit:
#   hvg$var <- metadata(hvg)$var
#   return(hvg)
# }
# 
# nhvg  <- do_hvg(t(pb))

pb_norm <- metamoRph::normalize_data(t(pb), sample_scale = 'cpm') %>% t() 
Sample CPM scaling
log1p scaling
# remove cell cycle genes
conv_table <- AnnotationDbi::select(org.Hs.eg.db::org.Hs.eg.db, 
                                    keys=gsub('\\.\\d+','',unique(colnames(pb_norm))),
                                    columns=c("ENSEMBL","SYMBOL", "MAP","GENENAME", "ENTREZID"), keytype="ENSEMBL")
'select()' returned 1:many mapping between keys and columns
cc_genes <- hvg %>% mutate(ENSEMBL = gsub("\\.\\d+","",V2)) %>% 
  left_join(conv_table, by = "ENSEMBL") %>% 
  mutate(cc_genes = case_when(SYMBOL %in% (Seurat::cc.genes.updated.2019 %>% unlist()) ~ TRUE)) %>% 
  filter(cc_genes) %>% pull(V2)
ribo_genes <- hvg %>% mutate(ENSEMBL = gsub("\\.\\d+","",V2)) %>% 
  left_join(conv_table, by = "ENSEMBL") %>% filter(grepl("^RPL|^RPS|^MT",SYMBOL)) %>% 
  pull(SYMBOL)

pb_norm <- pb_norm[,hvg$V2]
#pb_norm <- pb_norm[,hvg$V2[!hvg$V2 %in% c(cc_genes,ribo_genes)]]
# https://stats.stackexchange.com/questions/31565/compute-a-cosine-dissimilarity-matrix-in-r
sim <- pb_norm / sqrt(rowSums(pb_norm * pb_norm))
sim <- sim %*% t(sim)
D_sim <- as.dist(1 - sim)
Found more than one class "dist" in cache; using the first, from namespace 'BiocGenerics'
Also defined by ‘spam’
hclust_sim <- hclust(D_sim, method = 'average')

hclust_sim$labels <- obs$labels %>% pull(leiden3)

library(ggtree)
ggtree v3.12.0 For help: https://yulab-smu.top/treedata-book/

If you use the ggtree package suite in published research, please cite the
appropriate paper(s):

Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam. ggtree: an R
package for visualization and annotation of phylogenetic trees with their
covariates and other associated data. Methods in Ecology and Evolution. 2017,
8(1):28-36. doi:10.1111/2041-210X.12628

S Xu, Z Dai, P Guo, X Fu, S Liu, L Zhou, W Tang, T Feng, M Chen, L Zhan, T Wu, E
Hu, Y Jiang, X Bo, G Yu. ggtreeExtra: Compact visualization of richly annotated
phylogenetic data. Molecular Biology and Evolution. 2021, 38(9):4039-4042. doi:
10.1093/molbev/msab166

Guangchuang Yu.  Data Integration, Manipulation and Visualization of Phylogenetic
Trees (1st edition). Chapman and Hall/CRC. 2022, doi:10.1201/9781003279242 

Attaching package: ‘ggtree’

The following object is masked from ‘package:tidyr’:

    expand
p <- ggtree(hclust_sim)
p$data <- p$data %>% left_join(obs$labels, by = c("label" = "leiden3")) %>% 
  mutate(techRatio = round(techRatio, digits = ))
p + layout_dendrogram() +
  geom_tiplab(aes(label = paste(label, mMCT, studyCount, TotalCount, techRatio, sep = ' - '), color = mCT)) + 
  theme_dendrogram(plot.margin=margin(16,16,300,16)) +
  scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) + 
  guides(color="none")


p <- ggtree(hclust_sim)
p$data <- p$data %>% 
  left_join(obs$labels %>% 
              mutate(studies = case_when(studyCount ==1 ~ studies,
                                         TRUE ~ "multiple")), by = c("label" = "leiden3")) %>% 
  mutate(class = case_when(mCT %in% c("rod","cone","retinal ganglion",
                                      "amacrine","horizontal","rod bipolar","bipolar") ~ "neural",
                           TRUE ~ "non-neural"))
p + layout_dendrogram() +
  geom_tiplab(aes(label = paste(label, mMCT, studies, sep = ' - '), color = class)) + 
  theme_dendrogram(plot.margin=margin(16,16,300,16)) +
  scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) + 
  guides(color="none")



p <- ggtree(hclust_sim)
p$data <- p$data %>% left_join(obs$labels %>% mutate(studies = case_when(studyCount ==1 ~ studies,
                                                                         TRUE ~ "multiple")), by = c("label" = "leiden3")) 

p + layout_dendrogram() +
  geom_tiplab(aes(label = paste(label, mMCT, studies, sep = ' - '), color = mCT)) + 
  geom_tippoint(aes(shape = studies), size= 3) +
  theme_dendrogram(plot.margin=margin(16,16,300,16)) +
  scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) + 
  guides(color="none")




p <- ggtree(hclust_sim)
p$data <- p$data %>% left_join(obs$labels %>% mutate(studies = case_when(studyCount ==1 ~ studies,
                                                                         TRUE ~ "multiple")), by = c("label" = "leiden3")) 

p + layout_dendrogram() +
  geom_tiplab(aes(label = paste(label, mMCT, studies, sep = ' - '), color = mCT)) + 
  geom_tippoint(aes(shape = studies), size= 3) +
  theme_dendrogram(plot.margin=margin(16,16,300,16)) +
  scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) + 
  guides(color="none")

NA
NA

cutree

hclust 2 and 3 are non-neural

1 is neural

cutree_with_labels <- cutree(hclust_sim, k = 3) %>% enframe(name = 'leiden3', value = 'hclust_k3') %>% 
  mutate(leiden3 = as.integer(leiden3)) %>% 
  left_join(obs$labels) %>%   
  mutate(class = case_when(mCT %in% c("rod","cone","retinal ganglion",
                                      "amacrine","horizontal","rod bipolar","bipolar") ~ "neural",
                           TRUE ~ "non-neural")) 
Joining with `by = join_by(leiden3)`
p <- ggtree(hclust_sim)
p$data <- p$data %>% left_join(cutree_with_labels, by = c("label" = "leiden3"))

p + layout_dendrogram() +
  geom_tiplab(aes(color = as.factor(hclust_k3))) + 
  theme_dendrogram(plot.margin=margin(16,16,300,16)) +
  scale_color_manual(values = c(pals::alphabet(), pals::glasbey()) %>% unname())



cutree_with_labels %>% 
  group_by(hclust_k3, class) %>% 
  summarise(Count = n(), 
            leiden3 = paste(leiden3, collapse = ', '))
`summarise()` has grouped output by 'hclust_k3'. You can override using the `.groups` argument.

Remove Clusters

Reasons: - non-neural in hclust 1 - neural in hclust 2 or 3 - SRP447468 “exclusive” neural clusters (as they seem to group with themselves instead of their respective cluster types)

remove_leiden3 <- #c(71,93,126,112,144,39,21,85,135,7,74,36,124)
  # reason 1
  c(cutree_with_labels %>% filter(class == 'non-neural', hclust_k3 == 1) %>% pull(leiden3), 
    # reason 2
    cutree_with_labels %>% filter(class == 'neural', hclust_k3 %in% c(2,3)) %>% pull(leiden3),
    # reason 3
    c(153,131,150,62,4,100)
  )

Matthew Brooks / Anand Swaroop Diff Markers (retina)

Another sanity check for some cell types

labelled_diff_testing <- diff$diff_testing %>% 
  left_join(conv_table, by =c( "ENSEMBL")) %>% filter(SYMBOL %in% toupper(swarooks_markers$gene))
Warning: Detected an unexpected many-to-many relationship between `x` and `y`.
scored_labelled_diff_testing <- labelled_diff_testing %>% 
  left_join(swarooks_markers %>% mutate(gene = toupper(gene)), by = c("SYMBOL" = "gene")) %>% 
  group_by(base, cell_type) %>% 
  summarise(score = mean(scores)) 
Warning: Detected an unexpected many-to-many relationship between `x` and `y`.`summarise()` has grouped output by 'base'. You can override using the `.groups` argument.
scored_labelled_diff_testing %>% 
  pivot_wider(names_from = cell_type, values_from = score) %>% 
  dplyr::rename(leiden3 = base) %>% left_join(obs$labels %>% select(leiden3, mMCT)) %>% mutate(leiden3 = as.factor(leiden3)) %>%  DT::datatable(filter = 'top')
Joining with `by = join_by(leiden3)`

Stage 4

Run the scEiaD_modeling again on the neural and non-neural CLUSTERS

Neural

set.seed(1364578)
neural_bcs <- obs$obs %>% 
  left_join(cutree_with_labels, by = 'leiden3') %>% 
  filter(!leiden3 %in% remove_leiden3) %>% 
  mutate(class = case_when(MCT_scANVI %in% c("rod","cone","retinal ganglion",
                                             "amacrine","horizontal","rod bipolar","bipolar") ~ "neural",
                           TRUE ~ "non-neural")) %>% 
  filter(class == 'neural')


neural_ref <- neural_bcs %>% 
  ungroup() %>% 
  group_by(study_accession, MCT_scANVI) %>%  
  #filter(tissue %in% c('Retina')) %>% 
  sample_n(2000, replace = TRUE) %>% 
  unique()

low_study_n <- neural_ref %>% group_by(study_accession) %>% count() %>% filter(n < 100)
neural_ref <- neural_ref %>% ungroup() %>% 
  filter(!study_accession %in% low_study_n$study_accession)

neural_query <- neural_bcs %>% 
  filter(!barcode %in% neural_ref$barcode)


#neural_ref$barcode %>% write(gzfile('../data/hs111_mature_eye__stage4_neural_ref_bcs.20240924-02.csv.gz'))
#neural_query$barcode %>% write(gzfile('../data/hs111_mature_eye__stage4_neural_query_bcs.20240924-02.csv.gz'))

Follow up in Human_Mature_Eye_full__stage4_neural.Rmd

Non-Neural

set.seed(1364578)
nonneural_bcs <- obs$obs %>% 
  left_join(cutree_with_labels, by = 'leiden3') %>% 
  filter(!leiden3 %in% remove_leiden3) %>% 
  mutate(class = case_when(MCT_scANVI %in% c("rod","cone","retinal ganglion",
                                             "amacrine","horizontal","rod bipolar","bipolar") ~ "neural",
                           TRUE ~ "non-neural")) %>% 
  filter(class == 'non-neural')


nonneural_ref <- nonneural_bcs %>% 
  ungroup() %>% 
  group_by(study_accession, MCT_scANVI) %>%  
  #filter(tissue %in% c('Retina')) %>% 
  sample_n(2000, replace = TRUE) %>% 
  unique()

low_study_n <- nonneural_ref %>% group_by(study_accession) %>% count() %>% filter(n < 100)

# remove non author labelled cells from SRP447468 as a ref
# this study keeps being an outlier - just want to use cells which passed their QC
nonneural_ref <- bind_rows(nonneural_ref %>% ungroup() %>% 
                             filter(!study_accession %in% low_study_n$study_accession,
                                    study_accession != 'SRP447468'),
                           nonneural_ref %>% ungroup() %>% 
                             filter(study_accession == 'SRP447468',
                                    MajorCellType != 'unlabelled')
)

nonneural_query <- nonneural_bcs %>% 
  filter(!barcode %in% nonneural_ref$barcode)


#nonneural_ref$barcode %>% write(gzfile('../data/hs111_mature_eye__stage4_NONneural_ref_bcs.20241001-02.csv.gz'))
#nonneural_query$barcode %>% write(gzfile('../data/hs111_mature_eye__stage4_NONneural_query_bcs.20241001-02.csv.gz'))

Follow up in Human_Mature_Eye_full__stage4_NONneural.Rmd

Stage 5

Load in new CT calls (made by hand in the __stage4_ docs)

load("Human_Mature_Eye_full__stage4_NONneural.obs.freeze20241112.Rdata")
load("Human_Mature_Eye_full__stage4_neural.obs.freeze20241112.Rdata")
obs$nobs <- bind_rows(obs_nonneural$nobs %>% 
                        mutate(CT_l3=  paste0(CT, ":", leiden3),
                               hrCT_l3 = paste0(hrCT, ":", leiden3)) %>% 
                        select(barcode, CT, hrCT,CT_l3, hrCT_l3, suspect) %>% 
                        mutate(origin = 'nonneural') %>% 
                        left_join(obs$obs, by = 'barcode'),
                      obs_neural$nobs %>% 
                        mutate(CT_l3=  paste0(CT, ":", leiden3),
                               hrCT_l3 = paste0(hrCT, ":", leiden3)) %>% 
                        select(barcode, CT, hrCT,CT_l3, hrCT_l3, suspect) %>% 
                        mutate(origin = 'neural') %>% 
                        left_join(obs$obs, by = 'barcode'))
obs$nlabels <- bind_rows(obs_nonneural$nlabels %>% mutate(leiden3 = paste0(leiden3, ':nn')),
                         obs_neural$nlabels %>% mutate(leiden3 = paste0(leiden3, ':ne'))) %>% select(-newCT) %>% 
  relocate(leiden3, CT, hrCT)

# label leiden3 with >90% suspect labels
sus_stage3_l3 <- obs$nobs %>% 
  group_by(leiden3, suspect) %>% 
  summarise(Count = n()) %>% 
  mutate(Ratio = Count/sum(Count)) %>% 
  filter(suspect) %>% 
  arrange(-Ratio) %>% 
  filter(Ratio > 0.9)
`summarise()` has grouped output by 'leiden3'. You can override using the `.groups` argument.
obs$nobs <- obs$nobs %>% mutate(suspect2 = case_when(leiden3 %in% sus_stage3_l3$leiden3 ~ TRUE,
                                                     suspect ~ TRUE,
                                                     !suspect ~ FALSE)) %>% 
  mutate(suspect = suspect2) %>% 
  select(-suspect2)
obs$nobs %>% 
  ggplot(aes(x=umap1,y=umap2)) +
  scattermore::geom_scattermore(aes(color = CT), pointsize = 0.8, alpha = 0.5) +
  ggrepel::geom_label_repel(data = . %>% group_by(CT) %>%
                              summarise(umap1 = median(umap1),
                                        umap2 = median(umap2)),
                            aes(label = CT, color = CT)) +
  scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) + 
  cowplot::theme_cowplot() + theme(legend.position = "none") +
  facet_wrap(~suspect)


obs$nobs %>% 
  filter(!suspect) %>% 
  ggplot(aes(x=umap1,y=umap2)) +
  scattermore::geom_scattermore(aes(color = hrCT), pointsize = 0.8, alpha = 0.5) +
  ggrepel::geom_label_repel(data = . %>% group_by(hrCT) %>%
                              summarise(umap1 = median(umap1),
                                        umap2 = median(umap2)),
                            aes(label = hrCT, color = hrCT)) +
  scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) + 
  cowplot::theme_cowplot() + theme(legend.position = "none") 


obs$nobs %>% 
  filter(!suspect) %>% 
  ggplot(aes(x=umap1,y=umap2)) +
  scattermore::geom_scattermore(aes(color = CT),pointsize = 0.8, alpha = 0.5) +
  ggrepel::geom_label_repel(data = . %>% group_by(hrCT_l3, CT) %>%
                              summarise(umap1 = median(umap1),
                                        umap2 = median(umap2)),
                            aes(label = hrCT_l3,color = CT),
                            max.overlaps = Inf) +
  scale_color_manual(values = c(pals::alphabet2(), pals::glasbey(), pals::alphabet(), pals::watlington(),
                                pals::trubetskoy(), pals::tableau20(), pals::stepped(),
                                pals::polychrome(), pals::okabe()) %>% unname()) + 
  cowplot::theme_cowplot() + theme(legend.position = "none") 


obs$nobs %>%
  filter(!suspect) %>% 
  ggplot(aes(x=umap1,y=umap2)) +
  scattermore::geom_scattermore(aes(color = hrCT), pointsize = 0.8, alpha = 0.5) +
  ggrepel::geom_label_repel(data = . %>% group_by(CT, hrCT) %>%
                              summarise(umap1 = median(umap1),
                                        umap2 = median(umap2)),
                            aes(label = hrCT, color = hrCT)) +
  scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) + 
  cowplot::theme_cowplot() + theme(legend.position = "none") +
  facet_wrap(~CT)

Counts

mct <- obs$obs %>% 
  group_by(MajorCellType, study_accession, tissue) %>% 
  filter(MajorCellType != '', MajorCellType != 'unlabelled', MajorCellType != 'rpc') %>% 
  summarize(Count = n()) %>% 
  rename(CT = MajorCellType) %>% 
  mutate(CT = case_when(CT == 'oligo' ~ 'oligodendrocyte',
                        CT == 'sphincter' ~ 'muscle',
                        CT == 'fiber' ~ 'fibroblast',
                        TRUE ~ CT)) %>% 
  mutate(origin = case_when(CT %in% c("amacrine",
                                      "bipolar",
                                      "rod",
                                      "cone",
                                      "horizontal",
                                      "retinal ganglion") ~ 'neural',
                            TRUE ~ 'nonneural')) %>% 
  mutate(Class = 'Author Major Cell Type')
`summarise()` has grouped output by 'MajorCellType', 'study_accession'. You can override using the `.groups` argument.
mMCT <- obs$nobs %>% 
  filter(!suspect) %>% 
  group_by(CT, study_accession, tissue, origin) %>% 
  summarize(Count = n()) %>% 
  mutate(Class = 'ML Major Cell Type')
`summarise()` has grouped output by 'CT', 'study_accession', 'tissue'. You can override using the `.groups` argument.
bind_rows(mct,
          mMCT) %>% 
  filter(origin == 'neural') %>% 
  ggplot(aes(y=CT,x=Count,fill=study_accession)) +
  geom_bar(stat='identity') +
  ggforce::facet_row(~Class, scales = 'free') +
  cowplot::theme_cowplot() +
  scale_fill_manual(values = pals::kelly()[2:20])


bind_rows(mct,
          mMCT) %>% 
  filter(origin == 'nonneural') %>% 
  ggplot(aes(y=CT,x=Count,fill=study_accession)) +
  geom_bar(stat='identity') +
  ggforce::facet_row(~Class) +
  cowplot::theme_cowplot() +
  scale_fill_manual(values = pals::kelly()[2:20])

Outputs

# save(obs, file = 'Human_Mature_BackEye.freeze20241112.obs.Rdata')
# obs$nobs %>% write_csv("Human_Mature_BackEye.freeze20241112.nobs.csv.gz")
# obs$nlabels %>% write_csv("Human_Mature_BackEye.freeze20241112.nlabels.csv.gz")

Stage 6

Run mouse and chick (adult / not fetal) data against the Stage 4 nonneural and neural scANVI models.

Pondering how to best do this….

Parameters used (Stage 2 ran a bunch, in Stage 3 picked this one): 2000hvg 200e 30l

I have three models (on biowulf2):

  • Stage 2 (all)
  • Stage 4 (nonneural and neural)

Three approaches for new data: 1. Run against the Stage 2 model - downsides: - contains a lot of (later) removed cells (or is this an upside?) - built against

sessionInfo()
R version 4.4.1 (2024-06-14)
Platform: aarch64-apple-darwin20
Running under: macOS Sonoma 14.7.1

Matrix products: default
BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: America/New_York
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] ggtree_3.12.0   lubridate_1.9.3 forcats_1.0.0   stringr_1.5.1   dplyr_1.1.4    
 [6] purrr_1.0.2     readr_2.1.5     tidyr_1.3.1     tibble_3.2.1    ggplot2_3.5.1  
[11] tidyverse_2.0.0

loaded via a namespace (and not attached):
  [1] fs_1.6.4                    matrixStats_1.3.0           spatstat.sparse_3.1-0      
  [4] httr_1.4.7                  RColorBrewer_1.1-3          tools_4.4.1                
  [7] sctransform_0.4.1           utf8_1.2.4                  R6_2.5.1                   
 [10] DT_0.33                     lazyeval_0.2.2              uwot_0.2.2                 
 [13] withr_3.0.0                 sp_2.1-4                    gridExtra_2.3              
 [16] progressr_0.14.0            cli_3.6.3                   Biobase_2.64.0             
 [19] spatstat.explore_3.3-1      fastDummies_1.7.3           labeling_0.4.3             
 [22] sass_0.4.9                  Seurat_5.1.0                spatstat.data_3.1-2        
 [25] ggridges_0.5.6              pbapply_1.7-2               yulab.utils_0.1.5          
 [28] R.utils_2.12.3              dichromat_2.0-0.1           parallelly_1.38.0          
 [31] maps_3.4.2                  limma_3.60.4                rstudioapi_0.16.0          
 [34] RSQLite_2.3.7               pals_1.9                    generics_0.1.3             
 [37] gridGraphics_0.5-1          ica_1.0-3                   spatstat.random_3.3-1      
 [40] crosstalk_1.2.1             vroom_1.6.5                 Matrix_1.7-0               
 [43] fansi_1.0.6                 S4Vectors_0.42.1            abind_1.4-5                
 [46] R.methodsS3_1.8.2           lifecycle_1.0.4             yaml_2.3.10                
 [49] edgeR_4.2.1                 SummarizedExperiment_1.34.0 SparseArray_1.4.8          
 [52] Rtsne_0.17                  grid_4.4.1                  blob_1.2.4                 
 [55] promises_1.3.0              dqrng_0.4.1                 crayon_1.5.3               
 [58] miniUI_0.1.1.1              lattice_0.22-6              beachmat_2.20.0            
 [61] cowplot_1.1.3               KEGGREST_1.44.1             mapproj_1.2.11             
 [64] pillar_1.9.0                knitr_1.48                  metapod_1.12.0             
 [67] GenomicRanges_1.56.1        future.apply_1.11.2         codetools_0.2-20           
 [70] leiden_0.4.3.1              glue_1.7.0                  ggfun_0.1.5                
 [73] spatstat.univar_3.0-0       data.table_1.16.99          vctrs_0.6.5                
 [76] png_0.1-8                   treeio_1.28.0               spam_2.10-0                
 [79] gtable_0.3.5                cachem_1.1.0                xfun_0.48                  
 [82] S4Arrays_1.4.1              mime_0.12                   survival_3.7-0             
 [85] SingleCellExperiment_1.26.0 statmod_1.5.0               bluster_1.14.0             
 [88] fitdistrplus_1.2-1          ROCR_1.0-11                 nlme_3.1-165               
 [91] bit64_4.0.5                 RcppAnnoy_0.0.22            GenomeInfoDb_1.40.1        
 [94] bslib_0.8.0                 irlba_2.3.5.1               KernSmooth_2.23-24         
 [97] colorspace_2.1-1            BiocGenerics_0.50.0         DBI_1.2.3                  
[100] tidyselect_1.2.1            bit_4.0.5                   compiler_4.4.1             
[103] BiocNeighbors_1.22.0        DelayedArray_0.30.1         plotly_4.10.4              
[106] scales_1.3.0                lmtest_0.9-40               digest_0.6.36              
[109] goftest_1.2-3               spatstat.utils_3.0-5        rmarkdown_2.27             
[112] XVector_0.44.0              htmltools_0.5.8.1           pkgconfig_2.0.3            
[115] sparseMatrixStats_1.16.0    MatrixGenerics_1.16.0       fastmap_1.2.0              
[118] rlang_1.1.4                 htmlwidgets_1.6.4           UCSC.utils_1.0.0           
[121] shiny_1.9.0                 DelayedMatrixStats_1.26.0   farver_2.1.2               
[124] jquerylib_0.1.4             zoo_1.8-12                  jsonlite_1.8.8             
[127] BiocParallel_1.38.0         R.oo_1.26.0                 BiocSingular_1.20.0        
[130] magrittr_2.0.3              scuttle_1.14.0              GenomeInfoDbData_1.2.12    
[133] ggplotify_0.1.2             dotCall64_1.1-1             patchwork_1.2.0            
[136] munsell_0.5.1               Rcpp_1.0.13                 ape_5.8                    
[139] reticulate_1.38.0           stringi_1.8.4               zlibbioc_1.50.0            
[142] MASS_7.3-61                 plyr_1.8.9                  org.Hs.eg.db_3.19.1        
[145] parallel_4.4.1              listenv_0.9.1               ggrepel_0.9.5              
[148] deldir_2.0-4                Biostrings_2.72.1           splines_4.4.1              
[151] tensor_1.5                  hms_1.1.3                   locfit_1.5-9.10            
[154] igraph_2.0.3                spatstat.geom_3.3-2         RcppHNSW_0.6.0             
[157] metamoRph_0.2.1             reshape2_1.4.4              stats4_4.4.1               
[160] ScaledMatrix_1.12.0         evaluate_0.24.0             SeuratObject_5.0.2         
[163] scran_1.32.0                tweenr_2.0.3                tzdb_0.4.0                 
[166] httpuv_1.6.15               RANN_2.6.1                  polyclip_1.10-7            
[169] future_1.34.0               scattermore_1.2             ggforce_0.4.2              
[172] rsvd_1.0.5                  xtable_1.8-4                RSpectra_0.16-2            
[175] tidytree_0.4.6              later_1.3.2                 viridisLite_0.4.2          
[178] aplot_0.2.3                 memoise_2.0.1               AnnotationDbi_1.66.0       
[181] IRanges_2.38.1              cluster_2.1.6               timechange_0.3.0           
[184] globals_0.16.3             
---
title: "Human Mature Eye Creation"
output:
 html_notebook:
  author: "David McGaughey"
  date: "`r Sys.Date()`"
  theme: flatly
  toc: true
  toc_float: true
  code_folding: show
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  message = FALSE,  warning = FALSE,
  collapse = TRUE,
  fig.width = 12, fig.height = 8,
  comment = "#>",
  dpi=300
)
```

# Stage 1

Evaluate the cell type labels (the "truth" for scANVI / scVI). 

```{bash, exec = FALSE}
# edit and run this locally (laptop / etc)
~/git/scEiaD_quant/workflow/scripts/hand_change_ct_labels.R
# rsync the new cell label file to biowulf
rsync -Prav scEiaD_cell_labels_2024_08_27.csv.gz h2:/home/mcgaugheyd/git/scEiaD_quant/
# create the organism level h5ad file from the individual scEiaD_quant made h5ad files
# now on biowulf
# sinteractive --mem=128G
mamba deactivate; mamba activate rscvi
python ~/git/scEiaD_modeling/workflow/scripts/merge_adata.py hs111.fin.txt /home/mcgaugheyd/git/scEiaD_quant/sample_meta.scEiaD_v1.2024_02_28.01.tsv.gz /home/mcgaugheyd/git/scEiaD_quant/scEiaD_cell_labels_2024_08_27.csv.gz hs111.adata.solo.20240827.h5ad hs111.adata.solo.20240827.obs.csv.gz
# back to local computer to copy the cell metadata
cd ~/git/scEiaD_modeling
rsync -Prav h2:/data/OGVFB_BG/scEiaD/2024_02_28/hs111.adata.solo.20240827.obs.csv.gz .
```

# Stage 2

- pull data from one species
- filter to age group (dev or mature)
- select random (up to 2k) cell type per study and output those barcodes
- output barcodes for the non-selected cells
- run scvi on biowulf


```{r, exec = FALSE}
library(tidyverse)
sample_meta <- data.table::fread('~/git/scEiaD_quant/sample_meta.scEiaD_v1.2024_02_28.01.tsv.gz')
cell_meta <- data.table::fread('~/data/scEiaD_modeling/hs111.adata.solo.20240827.obs.csv.gz')[,-1] %>% 
  relocate(barcode) %>% 
  filter(solo_doublet == "FALSE")

hs111_eye <- cell_meta %>% 
  filter(study_accession != 'SRP362101') %>% 
  mutate(stage = case_when(as.numeric(age) <= 10 ~ 'Developing', 
                           TRUE ~ 'Mature'), 
         side = case_when(tissue == 'Brain Choroid Plexus' ~ 'Brain Choroid Plexus',
                          grepl("Choroid|RPE", tissue) ~ 'eye',
                          grepl("Retina", tissue) ~ 'eye',
                          grepl("Outf", tissue) ~ 'FrontEye',
                          grepl("Iris", tissue) ~ 'FrontEye',
                          grepl("Sclera", tissue) ~ 'FrontEye',
                          grepl("Cornea", tissue) ~ 'FrontEye',
                          grepl("Macula", tissue) ~ 'eye',
                          grepl("Trabecul", tissue) ~ 'FrontEye',
                          grepl("Optic", tissue) ~ 'eye',
                          TRUE ~ tissue)) %>% 
  filter(organ == 'Eye', # 2024 09 03 oops
         organism == 'Homo sapiens',
         !grepl("^#", sample_accession),
         source == 'Tissue',
         #tissue %in% c("Macula", "Retina"),
         #side %in% c("FrontEye", "eye"),
         #side %in% c("eye"),# 2024 08 31
         #capture_type == 'cell', # 2024 08 28
         #kb_tech %in% c("10xv1","10xv2","10xv3"), # 2024 08 28
         stage == 'Mature')# %>% 
#filter(study_accession != 'SRP447468') # 2024 08 29 remove (for now????) this optic nerve and adjacent study from sanes as it clusters (hclust) apart from everything else
# do not use studies with less than 1000 cells as ref
srp362101_sra <- read_csv('~/git/scEiaD_quant/data/SRP362101_SraRunTable.txt')
hs111_eye <- bind_rows(hs111_eye,
                       cell_meta %>% filter(sample_accession %in% srp362101_sra$Experiment)) # 2024 09 11
# do not use studies with less than 1000 cells as ref
hs111_study <- hs111_eye %>% 
  group_by(study_accession) %>% summarise(Count = n()) %>% filter(Count>1000)
set.seed(101294)
hs111_eye$MajorCellType %>% table()
hs111_ref_bcs <- hs111_eye %>% 
  filter(study_accession %in% hs111_study$study_accession) %>% 
  filter(MajorCellType != '',   # 2024 04 16 big change - only use labelled cells in ref
         MajorCellType != 'rpc') %>%  # 2024 08 30 23 cells in 83 year old labelled as rpc??? 
  # 2024 07 30 fixed mistake in filtering that still let them in
  group_by(study_accession, MajorCellType) %>%  # 2024 07 30 big change - sample on study *and* majorcelltype
  sample_n(2000, replace = TRUE) %>%  
  unique()

hs111_query_bcs <- hs111_eye %>% 
  filter(!barcode %in% hs111_ref_bcs$barcode) 

#hs111_ref_bcs$barcode %>% write(gzfile('~/git/scEiaD_modeling/data/hs111_mature_eye_ref_bcs.full.20240911-01.csv.gz'))
#hs111_query_bcs$barcode %>% write(gzfile('~/git/scEiaD_modeling/data/hs111_mature_eye_query_bcs.full.20240911-01.csv.gz'))

```

## run scVI

now go to biowulf2:/data/OGVFB_BG/scEiaD/2024_02_28/snakeout/hs111_mature_eye

```{bash, exec = FALSE}
# will take ~6-10 hours (depending on hpc usage)
mamba deactivate; mamba activate; bash ~/git/scEiaD_modeling/Snakemake.wrapper.sh ~/git/scEiaD_modeling/workflow/Snakefile ~/git/scEiaD_modeling/config/config_hs111_mature_eye_full.yaml ~/git/scEiaD_modeling/config/cluster.json
```

## rsync output from biowulf2 to local computer
```{bash, exec = FALSE}
cd /Users/mcgaugheyd/data/scEiaD_modeling/hs111_mature_eye
rsync -Prav h2:/data/OGVFB_BG/scEiaD/2024_02_28/snakeout/hs111_mature_eye_full/*scib* .
rsync -Prav h2:/data/OGVFB_BG/scEiaD/2024_02_28/snakeout/hs111_mature_eye_full/*obs* .
rsync -Prav h2:/data/OGVFB_BG/scEiaD/2024_02_28/snakeout/hs111_mature_eye_full/*.leiden3.csv.gz .
rsync -Prav h2:/data/OGVFB_BG/scEiaD/2024_02_28/snakeout/hs111_mature_eye_full/*hvg.csv.gz .
```

# Stage 3

## scib scoring
```{r}
scib_files <- list.files(path = '~/data/scEiaD_modeling/hs111_mature_eye_full/', full.names = TRUE) %>% grep('scib',.,value = TRUE)
scib_scores <- purrr::map(scib_files, read.csv)
names(scib_scores) <- scib_files %>% str_extract("\\d\\d\\d\\dhvg.*l") %>% gsub("^_","",.)
for (i in names(scib_scores)){
  scib_scores[[i]]$Delta <- as.numeric(scib_scores[[i]][2,'Total']) - as.numeric(scib_scores[[i]][1,'Total'])
}
bind_rows(scib_scores, .id = 'Params') %>% 
  filter(Embedding == 'X_scVI') %>% 
  mutate(Params= gsub("hvg|e|l","",Params)) %>% 
  separate(Params, into = c("HVG","Epochs", "Latent Dimensions"), sep = "_") %>% 
  mutate_all(as.numeric) %>% 
  mutate(Embedding = 'X_scVI') %>% 
  arrange(-Total)
```



## Assess Output
```{r}
library(tidyverse)
source('analysis_scripts.R')

obs <- pull_obs('~/data/scEiaD_modeling/hs111_mature_eye_full//hs111_mature_eye_20240911_withSUSPECTstudies2000hvg_200e_30l.obs.csv.gz')

```

```{r diff}
swarooks_markers <- read_csv('../data/MBrooks313_scRNAseq_Retina_Cell_type_MB_v2_positive.csv')

diff <- pull_diff("~/data/scEiaD_modeling/hs111_mature_eye_full/hs111_mature_eye_20240911_withSUSPECTstudies2000hvg_200e_30l.difftesting.leiden3.csv.gz")

obs$labels <- obs$labels %>% 
  left_join(diff$top_diff)
```




## Cluster Overview

### Ratio (percentage) of labelled cell types for each leiden3 cluster
```{r}
obs$labels %>% 
  arrange(aMCT) %>% 
  mutate(leiden3 = as.factor(leiden3)) %>% 
  DT::datatable(filter = 'top')
```



### background fraction against solo (doublet) score
Labelled points are clusters with more than one machine CT (above 0.1)
```{r}
obs$labels %>% 
  mutate(mMCT_unique_count = str_count(mMCT, ',') + 1) %>%  
  ggplot(aes(x=solo_score,y=background_fraction)) + 
  geom_point(aes(color = as.factor(mMCT_unique_count), size = log1p(TotalCount))) + 
  ggrepel::geom_label_repel(data = . %>% filter(mMCT_unique_count > 1), aes(label = leiden3), max.overlaps = Inf)
```


### Discrepancies
Between author cell type and machine (scANVI) cell type
```{r}
obs$labels %>% filter(aMCT != mMCT) %>% data.frame
```

Multiple mMCT above 10% of the total in a cluster
```{r}
obs$labels %>% filter(grepl(",",mMCT)) %>% select(-solo_score, -background_fraction, -cell_probability) %>% data.frame %>% arrange(-TotalCount)
```

## UMAP Plots
```{r, fig.width=16, fig.height=16}
obs$obs %>% 
  left_join(obs$labels, by = 'leiden3') %>% 
  ggplot(aes(x=umap1,y=umap2)) +
  scattermore::geom_scattermore(aes(color = MCT_scANVI), pointsize = 0.8, alpha = 0.5) +
  ggrepel::geom_label_repel(data = . %>% group_by(MCT_scANVI) %>% 
                              summarise(umap1 = median(umap1),
                                        umap2 = median(umap2)),
                            aes(label = MCT_scANVI, color = MCT_scANVI)) +
  scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) + 
  cowplot::theme_cowplot() + theme(legend.position = "none")

obs$obs %>% 
  left_join(obs$labels, by = 'leiden3') %>% 
  ggplot(aes(x=umap1,y=umap2)) +
  scattermore::geom_scattermore(aes(color = mCT), pointsize = 0.8, alpha = 0.5) +
  ggrepel::geom_label_repel(data = . %>% group_by(mCT) %>% 
                              summarise(umap1 = median(umap1),
                                        umap2 = median(umap2)),
                            aes(label = mCT, color = mCT)) +
  scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) + 
  cowplot::theme_cowplot() + theme(legend.position = "none")


obs$obs %>% 
  left_join(obs$labels, by = 'leiden3') %>% 
  ggplot(aes(x=umap1,y=umap2)) +
  scattermore::geom_scattermore(aes(color = mMCT), pointsize = 4, alpha = 0.5) +
  scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) + 
  cowplot::theme_cowplot() + theme(legend.position = "none") + 
  facet_wrap(~study_accession)



obs$obs %>% 
  left_join(obs$labels, by = 'leiden3') %>% 
  ggplot(aes(x=umap1,y=umap2)) +
  scattermore::geom_scattermore(aes(color = capture_type)) +
  facet_wrap(~MCT_scANVI) +
  scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) + 
  cowplot::theme_cowplot()

obs$obs %>% 
  left_join(obs$labels, by = 'leiden3') %>% filter(grepl("bipolar", mMCT)) %>% 
  ggplot(aes(x=umap1,y=umap2)) +
  scattermore::geom_scattermore(aes(color = study_accession), pointsize = 0.8, alpha = 0.8) +
  ggrepel::geom_label_repel(data = . %>% group_by(leiden3) %>% 
                              summarise(umap1 = median(umap1),
                                        umap2 = median(umap2)),
                            aes(label = leiden3)) +
  scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) + 
  ggtitle("bipolar") +
  cowplot::theme_cowplot() 

obs$obs %>% 
  left_join(obs$labels, by = 'leiden3') %>% filter(mMCT == 'rpe') %>% 
  ggplot(aes(x=umap1,y=umap2)) +
  scattermore::geom_scattermore(aes(color = study_accession), pointsize = 0.8, alpha = 0.8) +
  ggrepel::geom_label_repel(data = . %>% group_by(mCT) %>% 
                              summarise(umap1 = median(umap1),
                                        umap2 = median(umap2)),
                            aes(label = mCT)) +
  scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) + 
  cowplot::theme_cowplot() 

obs$obs %>% 
  left_join(obs$labels, by = 'leiden3') %>% filter(mMCT == 'amacrine') %>% 
  ggplot(aes(x=umap1,y=umap2)) +
  scattermore::geom_scattermore(aes(color = study_accession), pointsize = 0.8, alpha = 0.8) +
  ggrepel::geom_label_repel(data = . %>% group_by(leiden3) %>% 
                              summarise(umap1 = median(umap1),
                                        umap2 = median(umap2)),
                            aes(label = leiden3)) +
  scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) + 
  cowplot::theme_cowplot()+
  ggtitle("amacrine")

obs$obs %>% 
  left_join(obs$labels, by = 'leiden3') %>% filter(mMCT == 'amacrine') %>% 
  ggplot(aes(x=umap1,y=umap2)) +
  scattermore::geom_scattermore(aes(color = as.factor(leiden3)), pointsize = 0.8, alpha = 0.8) +
  ggrepel::geom_label_repel(data = . %>% group_by(mCT) %>% 
                              summarise(umap1 = median(umap1),
                                        umap2 = median(umap2)),
                            aes(label = mCT)) +
  scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) + 
  cowplot::theme_cowplot() 

obs$obs %>% 
  left_join(obs$labels, by = 'leiden3') %>% filter(mCT == 'mueller') %>% 
  ggplot(aes(x=umap1,y=umap2)) +
  scattermore::geom_scattermore(aes(color = as.factor(leiden3)), pointsize = 0.8, alpha = 0.8) +
  ggrepel::geom_label_repel(data = . %>% group_by(leiden3) %>% 
                              summarise(umap1 = median(umap1),
                                        umap2 = median(umap2)),
                            aes(label = leiden3)) +
  scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) + 
  cowplot::theme_cowplot() + ggtitle("mueller")

obs$obs %>% 
  left_join(obs$labels, by = 'leiden3') %>% filter(mCT == 'rod') %>% 
  ggplot(aes(x=umap1,y=umap2)) +
  scattermore::geom_scattermore(aes(color = as.factor(leiden3)), pointsize = 0.8, alpha = 0.8) +
  ggrepel::geom_label_repel(data = . %>% group_by(leiden3) %>% 
                              summarise(umap1 = median(umap1),
                                        umap2 = median(umap2)),
                            aes(label = leiden3)) +
  scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) + 
  cowplot::theme_cowplot() + ggtitle("rod")
```

## hclust
Take pseudobulk values (at the cluster level) and hierarchically cluster them to ensure 
there aren't any issues in either the overall structure (e.g. rod and cones are intersperse)d
and/or to identify any potential mislabeled clusters

```{r, fig.width = 18, fig.height = 10}
library(tidyverse)
pb <- data.table::fread('~/data/scEiaD_modeling/hs111_mature_eye_full/hs111_mature_eye_20240911_withSUSPECTstudies2000hvg_200e_30l.pseudoBulk.leiden3.csv.gz')
colnames(pb) <- gsub("\\.\\d+","",colnames(pb))
hvg <- data.table::fread('~/data/scEiaD_modeling/hs111_mature_eye_full/hvg2000.csv.gz')[-1,]
rnames <- pb$V1
clust <- str_extract(rnames, '\\d+') %>% as.integer()
pb <- pb[,-1] %>% as.matrix()
row.names(pb) <- as.character(clust)
pb <- pb[as.character(obs$labels$leiden3),]

# library(SingleCellExperiment)
# library(scater)
# library(scran)
# do_hvg <- function(cts){
#   sce <- SingleCellExperiment(list(counts = cts))
#   sce <- logNormCounts(sce)
#   hvg <- modelGeneVar(sce)
#   # Visualizing the fit:
#   hvg$var <- metadata(hvg)$var
#   return(hvg)
# }
# 
# nhvg  <- do_hvg(t(pb))

pb_norm <- metamoRph::normalize_data(t(pb), sample_scale = 'cpm') %>% t() 

# remove cell cycle genes
conv_table <- AnnotationDbi::select(org.Hs.eg.db::org.Hs.eg.db, 
                                    keys=gsub('\\.\\d+','',unique(colnames(pb_norm))),
                                    columns=c("ENSEMBL","SYMBOL", "MAP","GENENAME", "ENTREZID"), keytype="ENSEMBL")

cc_genes <- hvg %>% mutate(ENSEMBL = gsub("\\.\\d+","",V2)) %>% 
  left_join(conv_table, by = "ENSEMBL") %>% 
  mutate(cc_genes = case_when(SYMBOL %in% (Seurat::cc.genes.updated.2019 %>% unlist()) ~ TRUE)) %>% 
  filter(cc_genes) %>% pull(V2)
ribo_genes <- hvg %>% mutate(ENSEMBL = gsub("\\.\\d+","",V2)) %>% 
  left_join(conv_table, by = "ENSEMBL") %>% filter(grepl("^RPL|^RPS|^MT",SYMBOL)) %>% 
  pull(SYMBOL)

pb_norm <- pb_norm[,hvg$V2]
#pb_norm <- pb_norm[,hvg$V2[!hvg$V2 %in% c(cc_genes,ribo_genes)]]
# https://stats.stackexchange.com/questions/31565/compute-a-cosine-dissimilarity-matrix-in-r
sim <- pb_norm / sqrt(rowSums(pb_norm * pb_norm))
sim <- sim %*% t(sim)
D_sim <- as.dist(1 - sim)

hclust_sim <- hclust(D_sim, method = 'average')

hclust_sim$labels <- obs$labels %>% pull(leiden3)

library(ggtree)
p <- ggtree(hclust_sim)
p$data <- p$data %>% left_join(obs$labels, by = c("label" = "leiden3")) %>% 
  mutate(techRatio = round(techRatio, digits = ))
p + layout_dendrogram() +
  geom_tiplab(aes(label = paste(label, mMCT, studyCount, TotalCount, techRatio, sep = ' - '), color = mCT)) + 
  theme_dendrogram(plot.margin=margin(16,16,300,16)) +
  scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) + 
  guides(color="none")

p <- ggtree(hclust_sim)
p$data <- p$data %>% 
  left_join(obs$labels %>% 
              mutate(studies = case_when(studyCount ==1 ~ studies,
                                         TRUE ~ "multiple")), by = c("label" = "leiden3")) %>% 
  mutate(class = case_when(mCT %in% c("rod","cone","retinal ganglion",
                                      "amacrine","horizontal","rod bipolar","bipolar") ~ "neural",
                           TRUE ~ "non-neural"))
p + layout_dendrogram() +
  geom_tiplab(aes(label = paste(label, mMCT, studies, sep = ' - '), color = class)) + 
  theme_dendrogram(plot.margin=margin(16,16,300,16)) +
  scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) + 
  guides(color="none")


p <- ggtree(hclust_sim)
p$data <- p$data %>% left_join(obs$labels %>% mutate(studies = case_when(studyCount ==1 ~ studies,
                                                                         TRUE ~ "multiple")), by = c("label" = "leiden3")) 

p + layout_dendrogram() +
  geom_tiplab(aes(label = paste(label, mMCT, studies, sep = ' - '), color = mCT)) + 
  geom_tippoint(aes(shape = studies), size= 3) +
  theme_dendrogram(plot.margin=margin(16,16,300,16)) +
  scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) + 
  guides(color="none")



p <- ggtree(hclust_sim)
p$data <- p$data %>% left_join(obs$labels %>% mutate(studies = case_when(studyCount ==1 ~ studies,
                                                                         TRUE ~ "multiple")), by = c("label" = "leiden3")) 

p + layout_dendrogram() +
  geom_tiplab(aes(label = paste(label, mMCT, studies, sep = ' - '), color = mCT)) + 
  geom_tippoint(aes(shape = studies), size= 3) +
  theme_dendrogram(plot.margin=margin(16,16,300,16)) +
  scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) + 
  guides(color="none")


```

## cutree

hclust 2 and 3 are non-neural

1 is neural

```{r, fig.width = 18, fig.height = 10}
cutree_with_labels <- cutree(hclust_sim, k = 3) %>% enframe(name = 'leiden3', value = 'hclust_k3') %>% 
  mutate(leiden3 = as.integer(leiden3)) %>% 
  left_join(obs$labels) %>%   
  mutate(class = case_when(mCT %in% c("rod","cone","retinal ganglion",
                                      "amacrine","horizontal","rod bipolar","bipolar") ~ "neural",
                           TRUE ~ "non-neural")) 

p <- ggtree(hclust_sim)
p$data <- p$data %>% left_join(cutree_with_labels, by = c("label" = "leiden3"))

p + layout_dendrogram() +
  geom_tiplab(aes(color = as.factor(hclust_k3))) + 
  theme_dendrogram(plot.margin=margin(16,16,300,16)) +
  scale_color_manual(values = c(pals::alphabet(), pals::glasbey()) %>% unname())


cutree_with_labels %>% 
  group_by(hclust_k3, class) %>% 
  summarise(Count = n(), 
            leiden3 = paste(leiden3, collapse = ', '))



```


## Remove Clusters

Reasons:
- non-neural in hclust 1 
- neural in hclust 2 or 3
- SRP447468 "exclusive" neural clusters (as they seem to group with themselves instead of their respective cluster types)


```{r}
remove_leiden3 <- #c(71,93,126,112,144,39,21,85,135,7,74,36,124)
  # reason 1
  c(cutree_with_labels %>% filter(class == 'non-neural', hclust_k3 == 1) %>% pull(leiden3), 
    # reason 2
    cutree_with_labels %>% filter(class == 'neural', hclust_k3 %in% c(2,3)) %>% pull(leiden3),
    # reason 3
    c(153,131,150,62,4,100)
  )
```

## Matthew Brooks / Anand Swaroop Diff Markers (retina)
Another sanity check for some cell types

```{r}
labelled_diff_testing <- diff$diff_testing %>% 
  left_join(conv_table, by =c( "ENSEMBL")) %>% filter(SYMBOL %in% toupper(swarooks_markers$gene))

scored_labelled_diff_testing <- labelled_diff_testing %>% 
  left_join(swarooks_markers %>% mutate(gene = toupper(gene)), by = c("SYMBOL" = "gene")) %>% 
  group_by(base, cell_type) %>% 
  summarise(score = mean(scores)) 

scored_labelled_diff_testing %>% 
  pivot_wider(names_from = cell_type, values_from = score) %>% 
  dplyr::rename(leiden3 = base) %>% left_join(obs$labels %>% select(leiden3, mMCT)) %>% mutate(leiden3 = as.factor(leiden3)) %>%  DT::datatable(filter = 'top')
```

# Stage 4

Run the `scEiaD_modeling` again on the neural and non-neural CLUSTERS


## Neural

```{r}
set.seed(1364578)
neural_bcs <- obs$obs %>% 
  left_join(cutree_with_labels, by = 'leiden3') %>% 
  filter(!leiden3 %in% remove_leiden3) %>% 
  mutate(class = case_when(MCT_scANVI %in% c("rod","cone","retinal ganglion",
                                             "amacrine","horizontal","rod bipolar","bipolar") ~ "neural",
                           TRUE ~ "non-neural")) %>% 
  filter(class == 'neural')


neural_ref <- neural_bcs %>% 
  ungroup() %>% 
  group_by(study_accession, MCT_scANVI) %>%  
  #filter(tissue %in% c('Retina')) %>% 
  sample_n(2000, replace = TRUE) %>% 
  unique()

low_study_n <- neural_ref %>% group_by(study_accession) %>% count() %>% filter(n < 100)
neural_ref <- neural_ref %>% ungroup() %>% 
  filter(!study_accession %in% low_study_n$study_accession)

neural_query <- neural_bcs %>% 
  filter(!barcode %in% neural_ref$barcode)


#neural_ref$barcode %>% write(gzfile('../data/hs111_mature_eye__stage4_neural_ref_bcs.20240924-02.csv.gz'))
#neural_query$barcode %>% write(gzfile('../data/hs111_mature_eye__stage4_neural_query_bcs.20240924-02.csv.gz'))
```

### Follow up in Human_Mature_Eye_full__stage4_neural.Rmd
## Non-Neural

```{r}
set.seed(1364578)
nonneural_bcs <- obs$obs %>% 
  left_join(cutree_with_labels, by = 'leiden3') %>% 
  filter(!leiden3 %in% remove_leiden3) %>% 
  mutate(class = case_when(MCT_scANVI %in% c("rod","cone","retinal ganglion",
                                             "amacrine","horizontal","rod bipolar","bipolar") ~ "neural",
                           TRUE ~ "non-neural")) %>% 
  filter(class == 'non-neural')


nonneural_ref <- nonneural_bcs %>% 
  ungroup() %>% 
  group_by(study_accession, MCT_scANVI) %>%  
  #filter(tissue %in% c('Retina')) %>% 
  sample_n(2000, replace = TRUE) %>% 
  unique()

low_study_n <- nonneural_ref %>% group_by(study_accession) %>% count() %>% filter(n < 100)

# remove non author labelled cells from SRP447468 as a ref
# this study keeps being an outlier - just want to use cells which passed their QC
nonneural_ref <- bind_rows(nonneural_ref %>% ungroup() %>% 
                             filter(!study_accession %in% low_study_n$study_accession,
                                    study_accession != 'SRP447468'),
                           nonneural_ref %>% ungroup() %>% 
                             filter(study_accession == 'SRP447468',
                                    MajorCellType != 'unlabelled')
)

nonneural_query <- nonneural_bcs %>% 
  filter(!barcode %in% nonneural_ref$barcode)


#nonneural_ref$barcode %>% write(gzfile('../data/hs111_mature_eye__stage4_NONneural_ref_bcs.20241001-02.csv.gz'))
#nonneural_query$barcode %>% write(gzfile('../data/hs111_mature_eye__stage4_NONneural_query_bcs.20241001-02.csv.gz'))
```

### Follow up in Human_Mature_Eye_full__stage4_NONneural.Rmd


# Stage 5

Load in new CT calls (made by hand in the `__stage4_` docs)

```{r}
load("Human_Mature_Eye_full__stage4_NONneural.obs.freeze20241112.Rdata")
load("Human_Mature_Eye_full__stage4_neural.obs.freeze20241112.Rdata")
obs$nobs <- bind_rows(obs_nonneural$nobs %>% 
                        mutate(CT_l3=  paste0(CT, ":", leiden3),
                               hrCT_l3 = paste0(hrCT, ":", leiden3)) %>% 
                        select(barcode, CT, hrCT,CT_l3, hrCT_l3, suspect) %>% 
                        mutate(origin = 'nonneural') %>% 
                        left_join(obs$obs, by = 'barcode'),
                      obs_neural$nobs %>% 
                        mutate(CT_l3=  paste0(CT, ":", leiden3),
                               hrCT_l3 = paste0(hrCT, ":", leiden3)) %>% 
                        select(barcode, CT, hrCT,CT_l3, hrCT_l3, suspect) %>% 
                        mutate(origin = 'neural') %>% 
                        left_join(obs$obs, by = 'barcode'))
obs$nlabels <- bind_rows(obs_nonneural$nlabels %>% mutate(leiden3 = paste0(leiden3, ':nn')),
                         obs_neural$nlabels %>% mutate(leiden3 = paste0(leiden3, ':ne'))) %>% select(-newCT) %>% 
  relocate(leiden3, CT, hrCT)

# label leiden3 with >90% suspect labels
sus_stage3_l3 <- obs$nobs %>% 
  group_by(leiden3, suspect) %>% 
  summarise(Count = n()) %>% 
  mutate(Ratio = Count/sum(Count)) %>% 
  filter(suspect) %>% 
  arrange(-Ratio) %>% 
  filter(Ratio > 0.9)

obs$nobs <- obs$nobs %>% mutate(suspect2 = case_when(leiden3 %in% sus_stage3_l3$leiden3 ~ TRUE,
                                                     suspect ~ TRUE,
                                                     !suspect ~ FALSE)) %>% 
  mutate(suspect = suspect2) %>% 
  select(-suspect2)


```

```{r, fig.width=16, fig.height=16}
obs$nobs %>% 
  ggplot(aes(x=umap1,y=umap2)) +
  scattermore::geom_scattermore(aes(color = CT), pointsize = 0.8, alpha = 0.5) +
  ggrepel::geom_label_repel(data = . %>% group_by(CT) %>%
                              summarise(umap1 = median(umap1),
                                        umap2 = median(umap2)),
                            aes(label = CT, color = CT)) +
  scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) + 
  cowplot::theme_cowplot() + theme(legend.position = "none") +
  facet_wrap(~suspect)

obs$nobs %>% 
  filter(!suspect) %>% 
  ggplot(aes(x=umap1,y=umap2)) +
  scattermore::geom_scattermore(aes(color = hrCT), pointsize = 0.8, alpha = 0.5) +
  ggrepel::geom_label_repel(data = . %>% group_by(hrCT) %>%
                              summarise(umap1 = median(umap1),
                                        umap2 = median(umap2)),
                            aes(label = hrCT, color = hrCT)) +
  scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) + 
  cowplot::theme_cowplot() + theme(legend.position = "none") 

obs$nobs %>% 
  filter(!suspect) %>% 
  ggplot(aes(x=umap1,y=umap2)) +
  scattermore::geom_scattermore(aes(color = CT),pointsize = 0.8, alpha = 0.5) +
  ggrepel::geom_label_repel(data = . %>% group_by(hrCT_l3, CT) %>%
                              summarise(umap1 = median(umap1),
                                        umap2 = median(umap2)),
                            aes(label = hrCT_l3,color = CT),
                            max.overlaps = Inf) +
  scale_color_manual(values = c(pals::alphabet2(), pals::glasbey(), pals::alphabet(), pals::watlington(),
                                pals::trubetskoy(), pals::tableau20(), pals::stepped(),
                                pals::polychrome(), pals::okabe()) %>% unname()) + 
  cowplot::theme_cowplot() + theme(legend.position = "none") 

obs$nobs %>%
  filter(!suspect) %>% 
  ggplot(aes(x=umap1,y=umap2)) +
  scattermore::geom_scattermore(aes(color = hrCT), pointsize = 0.8, alpha = 0.5) +
  ggrepel::geom_label_repel(data = . %>% group_by(CT, hrCT) %>%
                              summarise(umap1 = median(umap1),
                                        umap2 = median(umap2)),
                            aes(label = hrCT, color = hrCT)) +
  scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) + 
  cowplot::theme_cowplot() + theme(legend.position = "none") +
  facet_wrap(~CT)

```


# Counts
```{r, fig.width=10}
mct <- obs$obs %>% 
  group_by(MajorCellType, study_accession, tissue) %>% 
  filter(MajorCellType != '', MajorCellType != 'unlabelled', MajorCellType != 'rpc') %>% 
  summarize(Count = n()) %>% 
  rename(CT = MajorCellType) %>% 
  mutate(CT = case_when(CT == 'oligo' ~ 'oligodendrocyte',
                        CT == 'sphincter' ~ 'muscle',
                        CT == 'fiber' ~ 'fibroblast',
                        TRUE ~ CT)) %>% 
  mutate(origin = case_when(CT %in% c("amacrine",
                                      "bipolar",
                                      "rod",
                                      "cone",
                                      "horizontal",
                                      "retinal ganglion") ~ 'neural',
                            TRUE ~ 'nonneural')) %>% 
  mutate(Class = 'Author Major Cell Type')

mMCT <- obs$nobs %>% 
  filter(!suspect) %>% 
  group_by(CT, study_accession, tissue, origin) %>% 
  summarize(Count = n()) %>% 
  mutate(Class = 'ML Major Cell Type')

bind_rows(mct,
          mMCT) %>% 
  filter(origin == 'neural') %>% 
  ggplot(aes(y=CT,x=Count,fill=study_accession)) +
  geom_bar(stat='identity') +
  ggforce::facet_row(~Class, scales = 'free') +
  cowplot::theme_cowplot() +
  scale_fill_manual(values = pals::kelly()[2:20])

bind_rows(mct,
          mMCT) %>% 
  filter(origin == 'nonneural') %>% 
  ggplot(aes(y=CT,x=Count,fill=study_accession)) +
  geom_bar(stat='identity') +
  ggforce::facet_row(~Class) +
  cowplot::theme_cowplot() +
  scale_fill_manual(values = pals::kelly()[2:20])

```


# Outputs
```{r}
# save(obs, file = 'Human_Mature_BackEye.freeze20241112.obs.Rdata')
# obs$nobs %>% write_csv("Human_Mature_BackEye.freeze20241112.nobs.csv.gz")
# obs$nlabels %>% write_csv("Human_Mature_BackEye.freeze20241112.nlabels.csv.gz")
```

# Stage 6
Run mouse and chick (adult / not fetal) data against the Stage 4 nonneural and neural scANVI models.

Pondering how to best do this....

Parameters used (Stage 2 ran a bunch, in Stage 3 picked this one): 2000hvg 200e 30l

I have three models (on biowulf2):

  - Stage 2 (all)
  - Stage 4 (nonneural and neural)
  
Three approaches for new data:
  1. Run against the Stage 2 model
    - downsides: 
      - contains a lot of (later) removed cells (or is this an upside?)
      - built against 
    
```{r}
sessionInfo()
```